ONLINE PAYMENT FRAUD DETECTION WITH MACHINE LEARNING¶

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
In [2]:
Data=pd.read_csv("/Users/mac/Downloads/online fraud payment.csv")
Data.head()
Out[2]:
step type amount nameOrig oldbalanceOrg newbalanceOrig nameDest oldbalanceDest newbalanceDest isFraud isFlaggedFraud
0 1 PAYMENT 9839.64 C1231006815 170136.0 160296.36 M1979787155 0.0 0.0 0 0
1 1 PAYMENT 1864.28 C1666544295 21249.0 19384.72 M2044282225 0.0 0.0 0 0
2 1 TRANSFER 181.00 C1305486145 181.0 0.00 C553264065 0.0 0.0 1 0
3 1 CASH_OUT 181.00 C840083671 181.0 0.00 C38997010 21182.0 0.0 1 0
4 1 PAYMENT 11668.14 C2048537720 41554.0 29885.86 M1230701703 0.0 0.0 0 0
In [3]:
Data.shape
Out[3]:
(6362620, 11)
In [4]:
Data.tail()
Out[4]:
step type amount nameOrig oldbalanceOrg newbalanceOrig nameDest oldbalanceDest newbalanceDest isFraud isFlaggedFraud
6362615 743 CASH_OUT 339682.13 C786484425 339682.13 0.0 C776919290 0.00 339682.13 1 0
6362616 743 TRANSFER 6311409.28 C1529008245 6311409.28 0.0 C1881841831 0.00 0.00 1 0
6362617 743 CASH_OUT 6311409.28 C1162922333 6311409.28 0.0 C1365125890 68488.84 6379898.11 1 0
6362618 743 TRANSFER 850002.52 C1685995037 850002.52 0.0 C2080388513 0.00 0.00 1 0
6362619 743 CASH_OUT 850002.52 C1280323807 850002.52 0.0 C873221189 6510099.11 7360101.63 1 0
In [5]:
Data.isna().sum()
Out[5]:
step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64
In [6]:
Data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            object 
 2   amount          float64
 3   nameOrig        object 
 4   oldbalanceOrg   float64
 5   newbalanceOrig  float64
 6   nameDest        object 
 7   oldbalanceDest  float64
 8   newbalanceDest  float64
 9   isFraud         int64  
 10  isFlaggedFraud  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB

so given dataset contains no null and nan values

before moving forward we need to find type transaction on given data set

In [7]:
Data.type.value_counts()
Out[7]:
CASH_OUT    2237500
PAYMENT     2151495
CASH_IN     1399284
TRANSFER     532909
DEBIT         41432
Name: type, dtype: int64
In [8]:
Data['step'].duplicated().value_counts()
Out[8]:
True     6361877
False        743
Name: step, dtype: int64
In [9]:
Data.drop_duplicates()
Data.head()
Out[9]:
step type amount nameOrig oldbalanceOrg newbalanceOrig nameDest oldbalanceDest newbalanceDest isFraud isFlaggedFraud
0 1 PAYMENT 9839.64 C1231006815 170136.0 160296.36 M1979787155 0.0 0.0 0 0
1 1 PAYMENT 1864.28 C1666544295 21249.0 19384.72 M2044282225 0.0 0.0 0 0
2 1 TRANSFER 181.00 C1305486145 181.0 0.00 C553264065 0.0 0.0 1 0
3 1 CASH_OUT 181.00 C840083671 181.0 0.00 C38997010 21182.0 0.0 1 0
4 1 PAYMENT 11668.14 C2048537720 41554.0 29885.86 M1230701703 0.0 0.0 0 0
In [10]:
Data.shape
Out[10]:
(6362620, 11)

Data visualization¶

In [11]:
TYPE=Data["type"].value_counts()
transactions=TYPE.index
quantity=TYPE.values

import plotly.express as px
figure=px.pie(Data,
             values=quantity,
             names=transactions,
             title='Distribution of Transaction Type')
figure.show()

Correlation¶

In [12]:
correlation=Data.corr()
print(correlation['isFraud'].sort_values(ascending=False))
isFraud           1.000000
amount            0.076688
isFlaggedFraud    0.044109
step              0.031578
oldbalanceOrg     0.010154
newbalanceDest    0.000535
oldbalanceDest   -0.005885
newbalanceOrig   -0.008148
Name: isFraud, dtype: float64

correlation in Heatmap visulization¶

In [13]:
from seaborn import heatmap
heatmap(Data.corr())
Out[13]:
<AxesSubplot:>
In [14]:
Data["type"] = Data["type"].replace({"CASH_OUT" : 1, "PAYMENT" : 2, "CASH_IN" : 3,"TRANSFER": 4,"DEBIT": 5})
#in this step we changeing VALUES on 'type' COLUMN OF DATASET that which we are replacing the values by 
#1 AS "CASH_OUT"
#2 AS "PAYMENT"
#3 AS "CASH_IN"
#4 AS "TRANSFER"
#5 AS "DEBIT"
In [15]:
Data["isFraud"] = Data["isFraud"].replace({0: "NO Fraud", 1: "Fraud"})
# same like previous set we replacing the values like in "isfraud"
# 0 as "NO FRAUD"
# 1 as "FRAUD"
In [16]:
Data.head(10)
# why we use head again because we are checking the valuse we replace are correct or not 
#check the column of 'type' and 'isfraud' 
Out[16]:
step type amount nameOrig oldbalanceOrg newbalanceOrig nameDest oldbalanceDest newbalanceDest isFraud isFlaggedFraud
0 1 2 9839.64 C1231006815 170136.00 160296.36 M1979787155 0.0 0.00 NO Fraud 0
1 1 2 1864.28 C1666544295 21249.00 19384.72 M2044282225 0.0 0.00 NO Fraud 0
2 1 4 181.00 C1305486145 181.00 0.00 C553264065 0.0 0.00 Fraud 0
3 1 1 181.00 C840083671 181.00 0.00 C38997010 21182.0 0.00 Fraud 0
4 1 2 11668.14 C2048537720 41554.00 29885.86 M1230701703 0.0 0.00 NO Fraud 0
5 1 2 7817.71 C90045638 53860.00 46042.29 M573487274 0.0 0.00 NO Fraud 0
6 1 2 7107.77 C154988899 183195.00 176087.23 M408069119 0.0 0.00 NO Fraud 0
7 1 2 7861.64 C1912850431 176087.23 168225.59 M633326333 0.0 0.00 NO Fraud 0
8 1 2 4024.36 C1265012928 2671.00 0.00 M1176932104 0.0 0.00 NO Fraud 0
9 1 5 5337.77 C712410124 41720.00 36382.23 C195600860 41898.0 40348.79 NO Fraud 0
In [17]:
from sklearn.model_selection import train_test_split
x = np.array(Data[["step","type", "amount", "oldbalanceOrg", "newbalanceOrig"]])
y = np.array(Data[["isFraud"]])

x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)
In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
In [19]:
model1=DecisionTreeClassifier()
In [20]:
model2=LogisticRegression()
In [21]:
model3=KNeighborsClassifier()
In [22]:
model4=RandomForestClassifier(n_estimators=150,criterion='entropy')
In [23]:
model1.fit(x_train, y_train.ravel())
Out[23]:
DecisionTreeClassifier()
In [24]:
model2.fit(x_train, y_train.ravel())
Out[24]:
LogisticRegression()
In [25]:
model3.fit(x_train, y_train.ravel())
Out[25]:
KNeighborsClassifier()
In [26]:
model4.fit(x_train, y_train.ravel())
Out[26]:
RandomForestClassifier(criterion='entropy', n_estimators=150)
In [27]:
print('DecisionTreeClassifier:',model1.score(x_test, y_test))
print('LogisticRegression',model2.score(x_test, y_test))
print('KNeighborsClassifier',model3.score(x_test, y_test))
print('RandomForestClassifier',model4.score(x_test, y_test))
DecisionTreeClassifier: 0.9996000075440621
LogisticRegression 0.9981509189610569
KNeighborsClassifier 0.9996314411358843
RandomForestClassifier 0.9996793773634132
In [28]:
Data.head()
Out[28]:
step type amount nameOrig oldbalanceOrg newbalanceOrig nameDest oldbalanceDest newbalanceDest isFraud isFlaggedFraud
0 1 2 9839.64 C1231006815 170136.0 160296.36 M1979787155 0.0 0.0 NO Fraud 0
1 1 2 1864.28 C1666544295 21249.0 19384.72 M2044282225 0.0 0.0 NO Fraud 0
2 1 4 181.00 C1305486145 181.0 0.00 C553264065 0.0 0.0 Fraud 0
3 1 1 181.00 C840083671 181.0 0.00 C38997010 21182.0 0.0 Fraud 0
4 1 2 11668.14 C2048537720 41554.0 29885.86 M1230701703 0.0 0.0 NO Fraud 0
In [29]:
Data.tail()
Out[29]:
step type amount nameOrig oldbalanceOrg newbalanceOrig nameDest oldbalanceDest newbalanceDest isFraud isFlaggedFraud
6362615 743 1 339682.13 C786484425 339682.13 0.0 C776919290 0.00 339682.13 Fraud 0
6362616 743 4 6311409.28 C1529008245 6311409.28 0.0 C1881841831 0.00 0.00 Fraud 0
6362617 743 1 6311409.28 C1162922333 6311409.28 0.0 C1365125890 68488.84 6379898.11 Fraud 0
6362618 743 4 850002.52 C1685995037 850002.52 0.0 C2080388513 0.00 0.00 Fraud 0
6362619 743 1 850002.52 C1280323807 850002.52 0.0 C873221189 6510099.11 7360101.63 Fraud 0
In [30]:
features = np.array([[ 1, 4, 181.00, 181.00, 0.0]])
print(model3.predict(features))
['Fraud']
In [ ]: